# Global chunk option: echo = TRUE is set for all chunks of this report.
knitr::opts_chunk$set(echo = TRUE)

Dataset: Mass Shootings in the US - Dataset Exploration

Mass Shootings in the United States of America (1966-2017). The US has witnessed 398 mass shootings in the last 50 years, resulting in 1996 deaths and 2488 injuries. The latest and worst mass shooting, on October 2, 2017, has killed 58 and injured 515 so far. The number of people injured in this attack is greater than the number of people injured in all mass shootings of 2015 and 2016 combined. Over the last 50 years there have been on average 7 mass shootings per year, claiming 39 lives and injuring 48 people per year.

Q2a) Analyzing Data

Loading the necessary packages

suppressWarnings(suppressMessages(library(data.table)))
suppressWarnings(suppressMessages(library(readr)))
suppressWarnings(suppressMessages(library(plotly)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(maps)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(wordcloud)))

Load the data from the source file, extract the year and month from the date, and merge the equivalent Gender labels.

# Read the incident CSV, parsing the Date column from month/day/year text,
# then preview the first five rows.
shootings_csv <- "C:/Users/riyac/Documents/Mass_Shootings_Dataset_Ver_5.csv"
date_parser <- col_date(format = "%m/%d/%Y")
MS_dataset <- read_csv(
  file = shootings_csv,
  col_types = cols(Date = date_parser)
)
head(MS_dataset, n = 5)
## # A tibble: 5 x 21
##    `S#`                               Title               Location
##   <int>                               <chr>                  <chr>
## 1     1          Texas church mass shooting Sutherland Springs, TX
## 2     2 Walmart shooting in suburban Denver           Thornton, CO
## 3     3     Edgewood businees park shooting           Edgewood, MD
## 4     4       Las Vegas Strip mass shooting          Las Vegas, NV
## 5     5          San Francisco UPS shooting      San Francisco, CA
## # ... with 18 more variables: Date <date>, `Incident Area` <chr>,
## #   `Open/Close Location` <chr>, Target <chr>, Cause <chr>, Summary <chr>,
## #   Fatalities <int>, Injured <int>, `Total victims` <int>, `Policeman
## #   Killed` <int>, Age <dbl>, `Employeed (Y/N)` <int>, `Employed
## #   at` <chr>, `Mental Health Issues` <chr>, Race <chr>, Gender <chr>,
## #   Latitude <dbl>, Longitude <dbl>
# Preview the last five rows of the loaded data.
tail(MS_dataset, n = 5)
## # A tibble: 5 x 21
##    `S#`                          Title               Location       Date
##   <int>                          <chr>                  <chr>     <date>
## 1   319 Clara Barton Elementary School      Chicago, Illinois 1974-01-17
## 2   320   New Orleans Police Shootings New Orleans, Louisiana 1972-12-31
## 3   321            St. Aloysius Church    Spokane, Washington 1971-11-11
## 4   322     Rose-Mar College of Beauty          Mesa, Arizona 1966-11-12
## 5   323  University of Texas at Austin          Austin, Texas 1966-08-01
## # ... with 17 more variables: `Incident Area` <chr>, `Open/Close
## #   Location` <chr>, Target <chr>, Cause <chr>, Summary <chr>,
## #   Fatalities <int>, Injured <int>, `Total victims` <int>, `Policeman
## #   Killed` <int>, Age <dbl>, `Employeed (Y/N)` <int>, `Employed
## #   at` <chr>, `Mental Health Issues` <chr>, Race <chr>, Gender <chr>,
## #   Latitude <dbl>, Longitude <dbl>
# Convert the tibble returned by read_csv() into a data.table; the `:=` and
# `DT[i, j, by]` expressions used further down operate on this object.
MS_dataset <- data.table(MS_dataset)
# Per-column summary (ranges, quartiles and NA counts for numeric columns).
summary(MS_dataset)
##        S#           Title             Location        
##  Min.   :  1.0   Length:323         Length:323        
##  1st Qu.: 81.5   Class :character   Class :character  
##  Median :162.0   Mode  :character   Mode  :character  
##  Mean   :162.0                                        
##  3rd Qu.:242.5                                        
##  Max.   :323.0                                        
##                                                       
##       Date            Incident Area      Open/Close Location
##  Min.   :1966-08-01   Length:323         Length:323         
##  1st Qu.:2001-01-15   Class :character   Class :character   
##  Median :2013-11-01   Mode  :character   Mode  :character   
##  Mean   :2007-11-16                                         
##  3rd Qu.:2015-12-02                                         
##  Max.   :2017-11-05                                         
##                                                             
##     Target             Cause             Summary            Fatalities    
##  Length:323         Length:323         Length:323         Min.   : 0.000  
##  Class :character   Class :character   Class :character   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 3.000  
##                                                           Mean   : 4.437  
##                                                           3rd Qu.: 5.500  
##                                                           Max.   :59.000  
##                                                                           
##     Injured        Total victims    Policeman Killed      Age         
##  Min.   :  0.000   Min.   :  3.00   Min.   :0.0000   Min.   :   0.00  
##  1st Qu.:  1.000   1st Qu.:  4.00   1st Qu.:0.0000   1st Qu.:  20.50  
##  Median :  3.000   Median :  5.00   Median :0.0000   Median :  34.00  
##  Mean   :  6.176   Mean   : 10.26   Mean   :0.1293   Mean   :  78.15  
##  3rd Qu.:  5.000   3rd Qu.:  9.00   3rd Qu.:0.0000   3rd Qu.:  42.00  
##  Max.   :527.000   Max.   :585.00   Max.   :5.0000   Max.   :1932.00  
##                                     NA's   :6        NA's   :144      
##  Employeed (Y/N)  Employed at        Mental Health Issues
##  Min.   :0.0000   Length:323         Length:323          
##  1st Qu.:0.0000   Class :character   Class :character    
##  Median :1.0000   Mode  :character   Mode  :character    
##  Mean   :0.6269                                          
##  3rd Qu.:1.0000                                          
##  Max.   :1.0000                                          
##  NA's   :256                                             
##      Race              Gender             Latitude       Longitude      
##  Length:323         Length:323         Min.   :21.33   Min.   :-161.79  
##  Class :character   Class :character   1st Qu.:33.57   1st Qu.:-110.21  
##  Mode  :character   Mode  :character   Median :36.44   Median : -88.12  
##                                        Mean   :37.23   Mean   : -94.43  
##                                        3rd Qu.:41.48   3rd Qu.: -81.70  
##                                        Max.   :60.79   Max.   : -69.71  
##                                        NA's   :20      NA's   :20
# Derive calendar columns from Date in a single by-reference update:
# Month and Year as factors (for discrete plot axes), Year_n as numeric.
MS_dataset[, `:=`(
  Month = as.factor(month(Date)),
  Year_n = as.numeric(year(Date)),
  Year = as.factor(year(Date))
)]

# Collapse the abbreviated Gender labels onto their spelled-out equivalents,
# label missing values as "Unknown", then store the column as a factor.
gender_labels <- c("M" = "Male", "M/F" = "Male/Female")
MS_dataset[
  Gender %in% names(gender_labels),
  Gender := unname(gender_labels[Gender])
]
MS_dataset[is.na(Gender), Gender := "Unknown"]
MS_dataset[, Gender := as.factor(Gender)]

2.1 Number of total victims by years

# Bar chart of the total number of victims per year.
#
# The data are aggregated per Year first so that each bar is the yearly total;
# plotting one bar per incident makes bars that share a year overlap instead of
# adding up. The hover text shows the yearly Fatalities and Injured sums.
victims_by_year <- MS_dataset[
  ,
  .(
    `Total victims` = sum(`Total victims`, na.rm = TRUE),
    Fatalities = sum(Fatalities, na.rm = TRUE),
    Injured = sum(Injured, na.rm = TRUE)
  ),
  by = Year
]

plot_ly(
  data = victims_by_year,
  type = "bar",
  hoverinfo = "text",
  x = ~Year,
  y = ~`Total victims`,
  # I() passes the colour through as-is; a bare string would be treated as a
  # data value and mapped onto the default palette instead of drawing red bars.
  color = I("red"),
  alpha = 0.9,
  text = ~paste(
    "Fatalities : ", Fatalities,
    "\n Injured : ", Injured
  )
) %>%
  layout(
    title = "Number of Total victims by years",
    xaxis = list(title = ""),
    yaxis = list(title = "Number of victims")
  )

2.2 Number of incidents by years

# Histogram of the number of incidents per year: every row of MS_dataset
# contributes one count to the bin of its Year factor level.
# `mode` is a scatter-trace attribute; it is not passed here because histogram
# traces do not have it.
plot_ly(
  data = MS_dataset,
  type = "histogram",
  x = ~Year,
  alpha = 0.9
) %>%
  layout(
    title = "Number of incidents by years",
    xaxis = list(title = ""),
    yaxis = list(title = "Number of incidents")
  )

2.3 Number of incidents by month

# Histogram of the number of incidents per calendar month: every row of
# MS_dataset contributes one count to the bin of its Month factor level.
# `mode` is a scatter-trace attribute; it is not passed here because histogram
# traces do not have it.
plot_ly(
  data = MS_dataset,
  type = "histogram",
  x = ~Month,
  alpha = 0.9
) %>%
  layout(
    title = "Number of incidents by month",
    xaxis = list(title = "Month"),
    yaxis = list(title = "Number of incidents")
  )

2.4 Pie chart by Mental Health Issues of the shooter

# Merge the lower-case "unknown" label into "Unknown" so both share one slice.
MS_dataset[
  `Mental Health Issues` %in% "unknown",
  `Mental Health Issues` := "Unknown"
]

# Slice colours shared by the pie charts in this report.
colors_pie1 <- c(
  "rgb(211,94,96)",
  "rgb(128,133,133)",
  "rgb(144,103,167)",
  "rgb(171,104,87)",
  "rgb(114,147,203)"
)

# Pie chart of Total victims split by the shooter's Mental Health Issues label.
mental_health_victims <- MS_dataset[
  ,
  .(`Total victims`, `Mental Health Issues`)
]
mental_health_marker <- list(
  colors = colors_pie1,
  line = list(color = "#FFFFFF", width = 1)
)

plot_ly(
  data = mental_health_victims,
  type = "pie",
  labels = ~`Mental Health Issues`,
  values = ~`Total victims`,
  textposition = "inside",
  insidetextfont = list(color = "#FFFFFF"),
  marker = mental_health_marker
) %>%
  layout(
    title = "Mental Health Issues",
    showlegend = TRUE
  )
# Extract a normalised state name from "City, State" style location strings.
#
# The last comma-separated field is taken as the state, surrounding whitespace
# is trimmed, and two-letter postal abbreviations are expanded to full names via
# the base R `state.abb` / `state.name` vectors, so that e.g. "TX" and "Texas"
# end up as the same value. Locations that are NA, have no comma, or have an
# empty last field yield NA.
#
# @param location Character vector of location strings.
# @return Character vector of the same length as `location`.
state_from_location <- function(location) {
  vapply(
    strsplit(location, split = ",", fixed = TRUE),
    function(parts) {
      if (length(parts) < 2L) {
        return(NA_character_)
      }
      state <- trimws(parts[length(parts)])
      if (!nzchar(state)) {
        return(NA_character_)
      }
      abb_index <- match(toupper(state), state.abb)
      if (is.na(abb_index)) state else state.name[abb_index]
    },
    character(1)
  )
}

MS_dataset[, State := state_from_location(Location)]

2.5 Pie chart with Number of incidents by States

# Count the distinct incident ids (`S#`) per State, ignoring rows without a
# State, and show each state's share as a pie slice.
incidents_by_state <- MS_dataset[
  !is.na(State),
  .(`Number of incidents` = uniqueN(`S#`)),
  by = State
]
state_pie_marker <- list(
  colors = colors_pie1,
  line = list(color = "#FFFFFF", width = 1)
)

plot_ly(
  data = incidents_by_state,
  type = "pie",
  labels = ~State,
  values = ~`Number of incidents`,
  textposition = "inside",
  insidetextfont = list(color = "#FFFFFF"),
  marker = state_pie_marker
) %>%
  layout(
    title = "Number of incidents by States",
    showlegend = TRUE
  )

2.6 Bar plot with Total victims by Years and Race

# Clean and merge the labels in the Race field.
#
# Each raw label is matched case-insensitively, after trimming whitespace,
# against the lookup below and replaced by its merged category. Labels that are
# not in the lookup are kept unchanged. Missing values are assigned to "Other"
# (the same category that receives "Unknown" and "unclear") rather than to
# "White", so that rows with no recorded race do not inflate a real category.
race_lookup <- c(
  "unclear" = "Other",
  "unknown" = "Other",
  "two or more races" = "Other",
  "some other race" = "Other",
  "black american or african american" = "Black",
  "black american or african american/unknown" = "Black",
  "black" = "Black",
  "white american or european american" = "White",
  "white american or european american/some other race" = "White",
  "white" = "White",
  "asian american" = "Asian",
  "asian american/some other race" = "Asian",
  "native american or alaska native" = "Native American"
)

MS_dataset[, Race := {
  merged <- unname(race_lookup[tolower(trimws(Race))])
  # Fall back to the raw label when the lookup has no entry for it.
  cleaned <- ifelse(is.na(merged), Race, merged)
  # Whatever is still NA here was missing in the raw data.
  cleaned[is.na(cleaned)] <- "Other"
  cleaned
}]
# Stacked bar chart of Total victims per Year, with one bar segment per Race.
victims_by_race_year <- MS_dataset[
  ,
  .(`Total victims` = sum(`Total victims`)),
  by = .(Race, Year)
]

# `mode` (a scatter-trace attribute) and `position` (not a layout attribute)
# are not passed. hovermode uses "x", which shows the hover labels of all
# segments at the hovered year; "compare" is not an accepted hovermode value.
plot_ly(
  data = victims_by_race_year,
  type = "bar",
  x = ~Year,
  y = ~`Total victims`,
  color = ~Race,
  alpha = 0.9
) %>%
  layout(
    title = "Total victims by Race",
    showlegend = TRUE,
    barmode = "stack",
    xaxis = list(title = ""),
    yaxis = list(title = ""),
    legend = list(x = 0, y = 1),
    hovermode = "x"
  )

2.7 Total victims & Fatalities on US map

# State outlines for the base map.
all_states <- map_data("state")

# Incidents to draw: only rows whose Longitude is at or east of -140.
mapped_incidents <- MS_dataset[Longitude >= -140, ]

# White states with black borders, overlaid with one point per incident;
# point size follows Total victims and colour runs from red to black with
# the number of Fatalities.
p <- ggplot() +
  geom_polygon(
    data = all_states,
    aes(x = long, y = lat, group = group),
    colour = "black",
    fill = "white"
  ) +
  geom_point(
    data = mapped_incidents,
    aes(
      x = Longitude,
      y = Latitude,
      size = `Total victims`,
      color = Fatalities
    ),
    alpha = 0.6
  ) +
  scale_color_gradient(low = "red", high = "black") +
  ggtitle("Total victims & Fatalities on US map")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)

Q2b) How was data made usable?

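  1. Cleaned and merged the inconsistent labels in the Race field
  2. Extracted State from the Location field
  3. Checked Location, Latitude and Longitude for missing values: Latitude and Longitude are missing for 20 incidents (see the summary output above), and those incidents do not appear on the map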